# Importing necessary libraries for plotting and reading files.
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
# Register tqdm's pandas integration: adds progress_apply (apply() with a
# progress bar) to DataFrame and Series.
tqdm.pandas()
# Since the csv file is very large in size, to save memory we are only reading
# the columns which might be necessary for our analysis.
df = pd.read_csv('./data/metadata.csv', usecols = ['source_x', 'title', 'abstract', 'journal'], low_memory = True)
# Preview the first rows (rendered inline in a notebook environment).
df.head()
# As instructed, the first source is selected for papers that list multiple
# sources ('source_x' is a ';'-separated field). The vectorised .str accessor
# replaces the Python-level lambda: it yields the same first-token result for
# string values and, unlike `x.split(';')[0]` inside a lambda, does not raise
# AttributeError if a null value is ever present in the column (it propagates
# NaN instead).
df['source'] = df['source_x'].str.split(';').str[0]
df.head()
# The ten journals that occur most often in the data set.
# value_counts() sorts by frequency (descending), so the first ten index
# entries are the top-10 journal names.
most_frequent_journals = df['journal'].value_counts().head(10).index
most_frequent_journals
# Plotting distribution of papers across the top 10 most frequent journals.
# Y-axis is log scaled for better visualization.
plot_df = df[df['journal'].isin(most_frequent_journals)]
# Cross-tabulate source vs. journal: one row per source, one column per
# journal, each cell holding the paper count.
results = pd.crosstab(plot_df['source'], plot_df['journal'])
plt.rcParams.update({'font.size': 14})
rcParams['figure.figsize'] = 25, 10
results.plot.bar(logy=True)
plt.show()
The chart above is generated using the crosstab method of pandas. Crosstab is a simple cross tabulation of two (or more) factors. Here the two factors are source and journal. The X-axis contains the names of the various sources and each source can have multiple journals. Each journal is signified by a color which can be understood by looking at the color legend in the figure. From the chart, we can deduce that WHO is the highest source of producing journals, with WHO contributing a wide range of journals. PMC and Medline aren't far behind. MedRxiv seems to publish the least amount of journals.
# Import libraries and modules for visualizations, clustering and text
# preprocessing.
import re
import string
from langdetect import detect
from scipy.spatial.distance import cdist
import nltk
# Download the stop-word corpus once (no-op if already present locally).
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
punctuations = string.punctuation
# NOTE(review): this rebinds the name `stopwords` from the imported nltk
# corpus object to a plain list of English stop words. All later cells rely
# on the list, so the corpus object is no longer reachable under this name.
stopwords = list(stopwords.words('english'))
lmtzr = WordNetLemmatizer()
# Removing all rows which have null values. Chaining .dropna() on the
# filtered frame (instead of calling dropna(inplace=True) on a slice of
# `df`) avoids pandas' SettingWithCopyWarning, where the in-place call may
# silently modify a temporary copy rather than `plot_df`.
plot_df = df[df['abstract'].notna()].dropna()
Due to memory restrictions of the system we are working on, we will limit our study to taking 10000 documents which are randomly sampled from our dataset.
# Sampling 10000 random rows due to memory limitation in system.
# random_state pins the sample so the notebook is reproducible.
plot_df = plot_df.sample(10000, random_state=2020)
# Extend the stop-word list with domain words that occur in almost every
# paper and therefore carry no discriminative signal for clustering.
custom_stop_words = [
'cell', 'virus', 'infection', 'study', 'using', 'disease', 'article', 'doi','used', 'may', 'also', 'based',
'covid', 'conclusion', 'result', 'however', 'yet', 'pandemic', 'cov', 'health', 'method', 'patient', 'patients'
]
# Only add words not already present, so re-running the cell is harmless.
stopwords.extend(word for word in custom_stop_words if word not in stopwords)
def detect_language(text):
    """Return the language code langdetect infers for *text*.

    Returns 'unk' when detection fails (langdetect raises on empty or
    non-linguistic input).
    """
    try:
        return detect(text)
    except Exception:
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
        # Exception is the widest net that is safe here.
        return 'unk'
We will restrict our study to only those papers written in English. Mixing up papers of different languages might lead to unexpected results. Hence we will filter out all papers whose language is not English. Besides this, we will apply some text preprocessing to all the abstract texts of the papers. This will help us filter out unnecessary and unimportant words or terms that might affect the quality of results.
# Detecting language of each abstract text and filtering all rows with
# language other than English.
plot_df['language'] = plot_df['abstract'].progress_apply(lambda x: detect_language(x))
plot_df = plot_df[plot_df['language'] == 'en']
# --- Text preprocessing pipeline on the 'abstract' column ---
# Series.progress_apply replaces the original row-wise
# DataFrame.progress_apply(..., axis=1): it performs the same per-value
# transformation without materialising a row object per record.
# Regexes are raw strings so '\d'/'\s' are not parsed as (invalid) string
# escape sequences, which raise SyntaxWarning on modern Python.
# Except for alphanumeric terms (plus apostrophes and spaces), filter out
# all other characters.
plot_df['tokenized_col'] = plot_df['abstract'].progress_apply(
    lambda text: re.sub(r"[^A-Za-z0-9' ]+", ' ', text))
# Filter out purely numeric terms as they don't help much in our analysis
# and can lead to unexpected behaviour.
plot_df['tokenized_col'] = plot_df['tokenized_col'].progress_apply(
    lambda text: re.sub(r"^\d+\s|\s\d+\s|\s\d+$", ' ', text))
# Lowercase all the abstract texts.
plot_df['tokenized_col'] = plot_df['tokenized_col'].progress_apply(str.lower)
# Tokenize each of the abstract texts into separate terms.
plot_df['tokenized_col'] = plot_df['tokenized_col'].progress_apply(word_tokenize)
# Lemmatize each word to reduce words to their root form wherever possible.
plot_df['tokenized_col'] = plot_df['tokenized_col'].progress_apply(
    lambda tokens: [lmtzr.lemmatize(w) for w in tokens])
# Filter out stopwords and terms shorter than 3 characters.
plot_df['tokenized_col'] = plot_df['tokenized_col'].progress_apply(
    lambda tokens: [w for w in tokens if w not in stopwords and len(w) >= 3])
# Join the tokens back into a single string column for future use.
plot_df['col'] = plot_df['tokenized_col'].progress_apply(' '.join)
# We will vectorize the abstract texts using a Tfidf vectorizer. The
# max_features (or vector dimension) is kept at 5000 to account for memory
# limitation issues: only the 5000 terms with the highest corpus-wide
# frequency are retained.
text = plot_df['col'].values
max_features = 5000
tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
# X is a sparse (n_documents x 5000) TF-IDF matrix.
X = tfidf_vectorizer.fit_transform(text)
X.shape
# Using PCA, we will reduce the dimensions of the vectors. This will make
# clustering of the texts by the k-means algorithm more memory efficient.
# n_components=0.95 keeps as many components as needed to explain 95% of
# the variance; copy=False lets PCA overwrite its input array to save memory.
pca = PCA(n_components=0.95, random_state=42, copy=False)
# PCA requires a dense array, hence .toarray() on the sparse TF-IDF matrix.
X_reduced = pca.fit_transform(X.toarray())
X_reduced.shape
We can see here that the dimension of the vectors got reduced from 5000 to 3215 after we apply Principal Component Analysis (PCA) on it.
Next we will apply clustering on the vectorized data. For this, we will use the K-means algorithm. The K-means algorithm requires the number of clusters (k) to be generated as a user input. So, to find the optimal number of clusters, we will use the elbow method. In the elbow method, we vary the number of clusters (K) from 10 to 24. For each value of K, we calculate the WCSS (Within-Cluster Sum of Squares). WCSS is the sum of squared distances between each point and the centroid of its cluster. When we plot the WCSS against the K value, the plot looks like an elbow. As the number of clusters increases, the WCSS value decreases; WCSS is largest when K = 1. When we analyze the graph, we can see that it changes rapidly at a point, creating an elbow shape. The value of K where the elbow shape appears can be considered an optimal value of k to be used in K-means.
# Run K-means with many different values of k and record the distortion
# (mean euclidean distance of each point to its nearest centroid) so the
# elbow method can be applied.
distortions = []
K = range(10, 25)
for k in tqdm(K):
    # The model is fitted once via the constructor chain; the original code
    # called .fit() a second time on the same data, doubling the work for
    # no benefit.
    k_means = KMeans(n_clusters=k, random_state=42).fit(X_reduced)
    distortions.append(sum(np.min(cdist(X_reduced, k_means.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
X_line = [K[0], K[-1]]
Y_line = [distortions[0], distortions[-1]]
# Plot the elbow curve (plus a straight reference line between the two
# endpoints) to find the optimal value of k.
plt.plot(K, distortions, 'b-')
plt.plot(X_line, Y_line, 'r')
plt.xticks(range(10, 25))
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()
Looking at the graph, it is safe to say that k=13 looks like a good choice as there is a definite elbow present at k=13. For our case, we will choose 13 as our optimal number of clusters while performing clustering on the abstract texts.
# Final clustering with the k chosen from the elbow plot.
k = 13
kmeans = KMeans(n_clusters=k, random_state=42)
# fit_predict returns the cluster label assigned to every document.
y_pred = kmeans.fit_predict(X_reduced)
plot_df['cluster'] = y_pred
Next we need to visualize the clusters that are generated by K-means. However, the current dimension of the vectors is huge and we cannot visualize anything with a dimension higher than 3. So, for proper visualization of the clusters, we will reduce the dimensions such that we can plot the results in a 2D graph. For this, we use t-SNE. t-SNE will attempt to preserve the relations of the higher dimensional data as closely as possible when shrunk to 2D.
# Project the TF-IDF vectors down to 2-D with t-SNE for visualisation.
# NOTE(review): t-SNE is run on the full TF-IDF matrix X, not on the
# PCA-reduced X_reduced that K-means was fitted on — presumably intentional,
# but worth confirming.
tsne = TSNE(verbose=1, perplexity=40)
X_embedded = tsne.fit_transform(X.toarray())
# sns settings
sns.set(rc={'figure.figsize':(13,9)})
# One distinct colour per cluster.
palette = sns.hls_palette(k, l=.4, s=.9)
# Pass x/y as keywords: seaborn deprecated (and newer versions removed)
# positional data arguments to scatterplot.
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], hue=y_pred, legend='full', palette=palette)
plt.title('t-SNE with K-means')
plt.show()
Each of the markers in the plot above signifies a document, and the colour of the marker represents the cluster it belongs to. K-means places document vectors with a small euclidean distance near to each other. We can see that in most cases, documents of the same cluster are very closely grouped with each other.
# Import all libraries necessary to generate and plot wordclouds.
import random
from textwrap import wrap
from wordcloud import WordCloud
# Re-add the custom stop words; membership is checked first, so this is a
# no-op if the earlier cell already appended them (the cell is idempotent).
for w in custom_stop_words:
    if w not in stopwords:
        stopwords.append(w)
For each of the clusters generated by K-means, we will generate a word cloud. The size of a word or term increases with the frequency of the word in the texts. That means the most frequently occurring words will have a larger size compared to words which occur sparsely.
def color_func(word, font_size, position, orientation, random_state=None,
               **kwargs):
    """Colour callback for WordCloud: red hue with a random lightness."""
    lightness = random.randint(0, 50)
    return f"hsl(0, {lightness}%, 50%)"
def generate_wordcloud(data, cluster):
    """Render a word cloud for *data* (an iterable of document strings),
    titled with the given *cluster* identifier."""
    cloud = WordCloud(background_color="black", max_words=2000,
                      stopwords=stopwords, width=800, height=600)
    cloud.generate(" ".join(data))
    # Recolour deterministically with the custom red-hue palette.
    cloud.recolor(color_func=color_func, random_state=42)
    plt.figure(figsize=(15,8))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    title_text = 'Cluster: ' + str(cluster)
    plt.title('\n'.join(wrap(title_text, 60)), fontsize=13)
    plt.show()
# Render one word cloud for every K-means cluster.
for cluster_id in range(k):
    cluster_docs = plot_df.loc[plot_df['cluster'] == cluster_id, 'col'].values
    generate_wordcloud(cluster_docs, cluster_id)
# import gensim and pyLDAvis for LDA topic modelling and visualization
import gensim
import gensim.corpora as corpora
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis
from pyLDAvis import gensim_models
Next we will perform topic modelling to generate topics from our data and also visualize the topic modelling. We will use Latent Dirichlet Allocation (LDA) to perform topic modelling.
Bigrams are two words frequently occurring together in the document. Trigrams are 3 words frequently occurring. Gensim’s Phrases model can build and implement the bigrams, trigrams, quadgrams and more. The two important arguments to Phrases are min_count and threshold. The higher the values of these param, the harder it is for words to be combined to bigrams.
# Token lists (one list per document) feeding the phrase models and LDA.
data_words = plot_df['tokenized_col'].values
# Build the bigram and trigram models. min_count/threshold control how
# eagerly adjacent tokens are merged into phrases.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Phraser freezes the model into a faster way to get a sentence clubbed as
# a trigram/bigram.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])
The two main inputs to the LDA topic model are the dictionary(id2word) and the corpus. We will create them first.
# Create Dictionary: maps each unique token to an integer id.
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency: each document becomes a list of
# (word_id, word_count) tuples.
corpus = [id2word.doc2bow(text) for text in texts]
# View the bag-of-words representation of the first document.
print(corpus[:1])
Gensim creates a unique id for each word in the document. The produced corpus shown above is a mapping of (word_id, word_frequency). For example, (0, 2) above implies, word id 0 occurs twice in the first document. Likewise, word id 2 occurs once and so on.
This is used as the input by the LDA model. If we want to see what word a given id corresponds to, we can pass the id as a key to the dictionary.
id2word[0]
We have everything required to train the LDA model. In addition to the corpus and dictionary, we need to provide the number of topics as well. For this, we will need to find the optimal value for number of topics.
Generally, topic coherence provides a convenient measure to judge how good a given topic model is. Our approach to finding the optimal number of topics here is to build many LDA models with different values of number of topics (k) and pick the one that gives the highest coherence value.
Choosing a ‘k’ that marks the end of a rapid growth of topic coherence usually offers meaningful and interpretable topics. Picking an even higher value can sometimes provide more granular sub-topics.
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various numbers of topics.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus (bag-of-words)
    texts : List of tokenised input texts
    limit : Max num of topics (exclusive); topic counts tried are
        range(start, limit, step)

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with
        respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in tqdm(range(start, limit, step)):
        # BUG FIX: the original passed the *global* id2word here instead of
        # the `dictionary` parameter, so the argument was silently ignored
        # and the function only worked because the global happened to match.
        model = gensim.models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                                       random_state=42, update_every=1, passes=10, alpha='auto',
                                       per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
# Sweep num_topics over range(2, 30, 4) = 2, 6, 10, ..., 26 and score each
# model's coherence.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus,
                                                        texts=data_words, start=2, limit=30, step=4)
# Show graph of coherence vs. number of topics (same range as above).
limit = 30
start = 2
step = 4
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# BUG FIX: ("coherence_values") is just a parenthesised string, which
# matplotlib's legend() iterates character by character; it must be a
# one-element sequence to label the single line correctly.
plt.legend(["coherence_values"], loc='best')
plt.show()
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
Since the coherence score seems to keep increasing with the number of topics, it makes better sense to pick the model that gave the highest coherence value before flattening out or a major drop. In this case, we pick K=14.
optimal_model = model_list[3]
Now that the LDA model is built, the next step is to examine the produced topics and the associated keywords. We can use pyLDAvis package’s interactive chart for this. It is designed to work well with jupyter notebooks.
# Interactive pyLDAvis chart of the topics (designed for Jupyter notebooks).
pyLDAvis.enable_notebook()
vis = gensim_models.prepare(optimal_model, corpus, id2word)
vis
Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic. A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.
If we move the cursor over one of the bubbles, the words and bars on the right-hand side will update. These words are the salient keywords that form the selected topic. We have successfully built a good looking topic model.
One of the practical application of topic modeling is to determine what topic a given document is about. To find that, we find the topic number that has the highest percentage contribution in that document.
The format_topics_sentences() function below nicely aggregates this information in a presentable table.
def format_topics_sentences(ldamodel, corpus, texts):
    """Build a table with one row per document: its dominant topic, that
    topic's percentage contribution, the topic's keywords, and (as a final
    unnamed column) the original text.

    Parameters
    ----------
    ldamodel : trained gensim LdaModel
    corpus : bag-of-words corpus the model is applied to
    texts : sequence of original texts, aligned with `corpus`
    """
    rows = []
    for doc_topics in ldamodel[corpus]:
        # doc_topics[0] is the per-document topic distribution; sort it so
        # the strongest topic comes first.
        topic_dist = sorted(doc_topics[0], key=lambda item: item[1], reverse=True)
        if not topic_dist:
            continue
        # Dominant topic, its contribution, and its top keywords.
        topic_num, prop_topic = topic_dist[0]
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join(word for word, prop in wp)
        rows.append([int(topic_num), round(prop_topic, 4), topic_keywords])
    # Building the frame once from a list of rows replaces the original
    # per-row DataFrame.append (removed in pandas 2.0, and quadratic).
    sent_topics_df = pd.DataFrame(
        rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])
    # Add original text to the end of the output.
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)
# Tag every abstract with its dominant topic, percentage contribution and
# topic keywords.
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=plot_df['abstract'].values)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# Show
df_dominant_topic
Sometimes just the topic keywords may not be enough to make sense of what a topic is about. So, to help with understanding the topic, we can find the documents a given topic has contributed to the most and infer the topic by reading that document.
# For each topic, keep the single document with the highest percentage
# contribution — the most representative document of that topic.
sent_topics_sorted = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
for topic_id, topic_group in sent_topics_outdf_grpd:
    best_doc = topic_group.sort_values(['Perc_Contribution'], ascending=False).head(1)
    sent_topics_sorted = pd.concat([sent_topics_sorted, best_doc], axis=0)
# Reset Index
sent_topics_sorted.reset_index(drop=True, inplace=True)
# Rename the columns for presentation.
sent_topics_sorted.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
# Show
sent_topics_sorted
The tabular output above has 14 rows, one each for a topic. It has the topic number, the keywords, and the most representative document. The Perc_Contribution column is nothing but the percentage contribution of the topic in the given document.
df_dominant_topic['cluster'] = plot_df['cluster'].values
Next we will see that for each topic, which are the most representative clusters?
# Normalised cluster distribution within each dominant topic, largest share
# first (value_counts sorts by frequency descending by default).
grouped = df_dominant_topic.groupby(['Dominant_Topic'])['cluster'].value_counts(ascending=False, normalize=True)
# The "Topic 0" header is printed up front because next_topic starts at 1.
next_topic = 1
print("Topic 0: ")
# `grouped` has a (Dominant_Topic, cluster) MultiIndex: i[0] is the topic,
# i[1] the cluster, and j the fraction of the topic's documents in it.
for i, j in zip(grouped.index, grouped.values):
    current_topic = i[0]
    if current_topic != next_topic:
        pass
    else:
        # A new topic starts: print its header once.
        # NOTE(review): this header logic assumes topics appear in ascending
        # order with no gaps — confirm before reusing elsewhere.
        print(f"Topic {int(i[0])}: ")
        next_topic += 1
    print(f"\tCluster: {int(i[1])}: {round(j*100, 2)}%")
print()
Here we can see that 36.96% of the documents in Topic 0 belong to Cluster 4. Similarly, for Topic 1, 63.38% of the documents belong to Cluster 3. Accordingly all the distribution of the clusters with each topic is shown.
Next we will see that for each cluster, which are the most representative topics?
# Normalised topic distribution within each K-means cluster, largest share
# first — the mirror image of the previous cell.
grouped = df_dominant_topic.groupby(['cluster'])['Dominant_Topic'].value_counts(ascending=False, normalize=True)
# The "Cluster 0" header is printed up front because next_cluster starts at 1.
next_cluster = 1
print("Cluster 0: ")
# `grouped` has a (cluster, Dominant_Topic) MultiIndex: i[0] is the cluster,
# i[1] the topic, and j the fraction of the cluster's documents in it.
for i, j in zip(grouped.index, grouped.values):
    current_cluster = i[0]
    if current_cluster != next_cluster:
        pass
    else:
        # A new cluster starts: print its header once (same ordering
        # assumption as the topic loop above).
        print(f"Cluster {i[0]}: ")
        next_cluster += 1
    print(f"\tTopic: {int(i[1])}: {round(j*100, 2)}%")
print()
Here we can see that 91.67% of the documents that belong to Cluster 0 are in Topic 6. Similarly, for Cluster 12, 28.96% of the documents belong to Topic 7. Accordingly all the distribution of the topics with each cluster is shown.
def top_kw_clusters(cluster_id, top_n=10):
    """Return the top_n most frequent tokens among the preprocessed
    abstracts ('col') of the given K-means cluster."""
    subset = plot_df[plot_df['cluster'] == cluster_id]
    token_counts = pd.Series(' '.join(subset['col']).lower().split()).value_counts()
    return list(token_counts.head(top_n).index)
topics_kws = list()
# Collect the 20 highest-probability keywords of each LDA topic
# (14 = num_topics of optimal_model).
for i in range(14):
    print(f"Top 20 keywords in Topic {i}:")
    topic_kws = [id2word[tup[0]] for tup in optimal_model.get_topic_terms(i, 20)]
    print(topic_kws)
    topics_kws.append(topic_kws)
    print()
print()
clusters_kws = list()
# Collect the 20 most frequent tokens of each K-means cluster.
for i in range(k):
    print(f"Top 20 most commonly occuring words in cluster {i}:")
    cluster_kws = top_kw_clusters(i, top_n=20)
    print(cluster_kws)
    clusters_kws.append(cluster_kws)
    print()
print()
# One column per cluster, one row per keyword rank.
kw_clusters_df = pd.DataFrame(clusters_kws).transpose()
kw_clusters_df.columns =[f'Cluster {i}' for i in range(k)]
kw_clusters_df
# One column per topic, one row per keyword rank.
kw_topics_df = pd.DataFrame(topics_kws).transpose()
kw_topics_df.columns =[f'Topic {i}' for i in range(14)]
kw_topics_df
We can compare the two dataframes above and clearly see which clusters are very similar to which topics by looking at the keywords alone. In each of the dataframes above, each column denotes a cluster or a topic and each row denotes a keyword that belongs to that cluster/topic.